# This script cleans the synthetic articles used for robustness checks across different languages.
# It finds misspelled, transliterated or translated variants of the placeholder 'POLITFIG' and replaces them with the canonical 'POLITFIG' token.
import pandas as pd
from fuzzywuzzy import process
import re

# Path of the input CSV holding the synthetic articles
file_path = 'data/synthetic/synthetic_articles_all_gpt-3.5-turbo.csv'

# Load the articles into a DataFrame; rows pandas cannot parse are dropped
# rather than aborting the whole load
df = pd.read_csv(filepath_or_buffer=file_path, on_bad_lines='skip')

# Hand-collected spellings, transliterations and translations that appear in
# the articles in place of the 'POLITFIG' placeholder, across all languages
# found. replace_variations() reads this list, and the order of the entries
# is kept exactly as collected.
exact_variations = [
    'FIGURA', '폴리티픽', 'POLTFIG', 'ポリティカルニュース', '波利特菲格', '国家主席', 'القائد الحالي للبلاد', 
    'ポリトフィグ', 'ポリティックス', 'ポリティカルフィグ', 'Политические', 'ポリティフィグ', '정치인', '폴리픽', 
    'POLÍTFIG', 'ПОЛИТФИГ', '国家领导人', 'الزعيم الحالي', 'زعيمنا الحالي', '폴리피그', '政府部门', 'بوليتفيج', 
    '폴리티그', 'رئيس البلاد', 'سياسي فاشل', '政治家', '포릿피그', 'POLITFIG', 'POLTIFIG', '政治フィグ', 
    'POLTIFG', 'بوليتفيغ', 'POLFIG', 'POLITICO', '폴리티피그', 'ポリティクス', 'الصحفي السياسي', '포리트픽', 
    'القائد السياسي', '폴리트픽', '정치 피규', 'POLIFIG', 'قائدنا الحالي', '포리트피그', 'القائد الحكيم', 
    '中共中央总书记', 'ポリットフィグ', '폴리트피그', '폴릿피그', 'بولتفيغ', 'سياسي', 'ПОЛИТФИГА', '포릿피그 지도자', 
    '포리티그', '政治人物', 'POLITICOFIG', 'رئيس الوزراء', '政治ニュース', 'ПОЛИТФИГа', 'بوليتفي', '政治家ポリットフィグ', '포릿픽'
]

# Working set of every variation known so far: seeded with the exact list
# above and extended in place by find_variations().
# NOTE(review): the original comment claimed 74 entries, but the literal list
# above appears to contain 62 — TODO confirm which is current.
variations = set(exact_variations)

def replace_concatenations(text):
    """Surround glued ``POLITFIG`` tokens with spaces.

    An occurrence of ``POLITFIG`` is rewritten to ``' POLITFIG '`` when it has
    no literal space character on *either* side. The start and end of the
    string count as "no space", so a string consisting only of ``POLITFIG``
    becomes ``' POLITFIG '``.

    NOTE(review): an occurrence that already has a space on one side is left
    untouched even if it is glued to text on the other side (for example
    ``'xPOLITFIG y'``). Confirm whether that is intended before widening the
    pattern; padding each side independently would also insert a space
    before punctuation such as ``'POLITFIG,'``.

    Args:
        text: The article text. Non-string values are returned unchanged.

    Returns:
        The text with glued ``POLITFIG`` occurrences padded, or ``text``
        itself when it is not a string.
    """
    # Empty CSV cells reach this function as float NaN (or None) when called
    # through Series.apply; re.sub would raise TypeError on them.
    if not isinstance(text, str):
        return text
    return re.sub(r'(?<! )POLITFIG(?! )', r' POLITFIG ', text)

# Pipeline step: pad glued 'POLITFIG' tokens in every article before the
# variation search and replacement that follow
df['Article'] = df['Article'].apply(func=replace_concatenations)

def find_variations(text, word='POLITFIG', threshold=80):
    """Record every token of ``text`` that fuzzily matches ``word``.

    The text is split on whitespace and each distinct token is scored
    against ``word`` with fuzzywuzzy. Tokens scoring at least ``threshold``
    are added to the module-level ``variations`` set, which is already seeded
    with ``exact_variations`` where it is defined.

    Args:
        text: The article text. Non-string values (e.g. NaN from an empty
            CSV cell) are ignored.
        word: The reference word to compare tokens against.
        threshold: Minimum fuzzywuzzy score (0-100) for a token to count as
            a variation.

    Returns:
        None. The result is the side effect on ``variations``.
    """
    if not isinstance(text, str):
        return

    # De-duplicate while keeping first-seen order. Repeated exact hits would
    # otherwise be scored again and again for no gain.
    candidates = list(dict.fromkeys(text.split()))
    if not candidates:
        return

    # extractBests keeps only the 5 best matches by default; with duplicates
    # in the token list those slots could all be taken by exact 'POLITFIG'
    # hits, hiding the real variants. limit=None returns every token that
    # reaches the cutoff.
    matches = process.extractBests(
        word, candidates, score_cutoff=threshold, limit=None
    )

    variations.update(token for token, _ in matches)

# Scan every article; find_variations() records its matches in `variations`
# and returns nothing, so the resulting Series is discarded.
df['Article'].apply(find_variations)

# Iteration order of a set of strings differs between interpreter runs, so
# sort the listing to make it comparable from run to run.
# (The original comment noted 226 variations for an earlier run.)
print('Variations of "POLITFIG":')
for variation in sorted(variations):
    print(variation)

def replace_variations(text, variants=None):
    """Replace every known variant in ``text`` with ``'POLITFIG'``.

    Matching is case-insensitive and anchored with ``\\b`` on both sides.
    All variants are tried in a single pass, longest first, so a multi-word
    variant such as ``'포릿피그 지도자'`` is replaced as a unit instead of having
    its first word consumed by the shorter variant ``'포릿피그'``.

    NOTE(review): ``\\b`` is Unicode-aware in Python 3, so a variant that is
    directly adjacent to other letters is not matched. That includes CJK text
    written without spaces; confirm this is the intended behaviour for the
    Chinese and Japanese entries.

    Args:
        text: The article text. Non-string values (e.g. NaN from an empty
            CSV cell) are returned unchanged.
        variants: Optional iterable of variant strings. Defaults to the
            module-level ``exact_variations`` list.

    Returns:
        The text with every matched variant replaced by ``'POLITFIG'``.
    """
    if not isinstance(text, str):
        return text
    if variants is None:
        variants = exact_variations

    # Longest first: regex alternation takes the first alternative that
    # matches at a position, so longer variants must precede their prefixes.
    # sorted() is stable, so equal-length variants keep their list order.
    ordered = sorted(variants, key=len, reverse=True)
    if not ordered:
        return text

    # The re module caches compiled patterns, so rebuilding the same pattern
    # string for every article does not recompile it.
    alternatives = '|'.join(re.escape(variant) for variant in ordered)
    pattern = r'\b(?:' + alternatives + r')\b'
    return re.sub(pattern, 'POLITFIG', text, flags=re.IGNORECASE)

# Pipeline step: normalise every listed variant in the 'Article' column to
# the canonical 'POLITFIG' token
df['Article'] = df['Article'].apply(func=replace_variations)

# Persist the cleaned articles alongside the input file, without writing the
# DataFrame index as a column
df.to_csv(
    path_or_buf='data/synthetic/synthetic_articles_all_gpt-3.5-turbo_cleaned.csv',
    index=False,
)